Date: Thu Feb 27 22:59:42 2020
Scientist: Ran Yin
Sequencing (Waksman): Dibyendu Kumar (?)
Statistics: Davit Sargsyan
Principal Investigator: Ah-Ng Kong
Sources
Data
FastQ files were downloaded from this Rutgers Box location. A total of 144 files (2 per sample, pair-ended) and a pair of undetermined reads were downloaded.
16s metadata Sep-2019.xlsx meta-data file was created by Ran. It was saved as .CSV file and used by this script.
Load libraries
Questions
- Did microbiome change over time?
- Was microbiome affected by diet?
- Was microbiome affected by KO compared to WT?
FastQ files
# Get FastQ file names----
list.files(path = path,
pattern = ".gz")
[1] "RH01_S1_L001_R1_001.fastq.gz" "RH01_S1_L001_R2_001.fastq.gz" "RH02_S2_L001_R1_001.fastq.gz"
[4] "RH02_S2_L001_R2_001.fastq.gz" "RH03_S3_L001_R1_001.fastq.gz" "RH03_S3_L001_R2_001.fastq.gz"
[7] "RH04_S4_L001_R1_001.fastq.gz" "RH04_S4_L001_R2_001.fastq.gz" "RH05_S5_L001_R1_001.fastq.gz"
[10] "RH05_S5_L001_R2_001.fastq.gz" "RH06_S6_L001_R1_001.fastq.gz" "RH06_S6_L001_R2_001.fastq.gz"
[13] "RH07_S7_L001_R1_001.fastq.gz" "RH07_S7_L001_R2_001.fastq.gz" "RH08_S8_L001_R1_001.fastq.gz"
[16] "RH08_S8_L001_R2_001.fastq.gz" "RH09_S9_L001_R1_001.fastq.gz" "RH09_S9_L001_R2_001.fastq.gz"
[19] "RH10_S10_L001_R1_001.fastq.gz" "RH10_S10_L001_R2_001.fastq.gz" "RH11_S11_L001_R1_001.fastq.gz"
[22] "RH11_S11_L001_R2_001.fastq.gz" "RH12_S12_L001_R1_001.fastq.gz" "RH12_S12_L001_R2_001.fastq.gz"
[25] "RH13_S13_L001_R1_001.fastq.gz" "RH13_S13_L001_R2_001.fastq.gz" "RH14_S14_L001_R1_001.fastq.gz"
[28] "RH14_S14_L001_R2_001.fastq.gz" "RH15_S15_L001_R1_001.fastq.gz" "RH15_S15_L001_R2_001.fastq.gz"
[31] "RH16_S16_L001_R1_001.fastq.gz" "RH16_S16_L001_R2_001.fastq.gz" "RH17_S17_L001_R1_001.fastq.gz"
[34] "RH17_S17_L001_R2_001.fastq.gz" "RH18_S18_L001_R1_001.fastq.gz" "RH18_S18_L001_R2_001.fastq.gz"
[37] "RH19_S19_L001_R1_001.fastq.gz" "RH19_S19_L001_R2_001.fastq.gz" "RH20_S20_L001_R1_001.fastq.gz"
[40] "RH20_S20_L001_R2_001.fastq.gz" "RH21_S21_L001_R1_001.fastq.gz" "RH21_S21_L001_R2_001.fastq.gz"
[43] "RH22_S22_L001_R1_001.fastq.gz" "RH22_S22_L001_R2_001.fastq.gz" "RH23_S23_L001_R1_001.fastq.gz"
[46] "RH23_S23_L001_R2_001.fastq.gz" "RH24_S24_L001_R1_001.fastq.gz" "RH24_S24_L001_R2_001.fastq.gz"
[49] "RH25_S25_L001_R1_001.fastq.gz" "RH25_S25_L001_R2_001.fastq.gz" "RH26_S26_L001_R1_001.fastq.gz"
[52] "RH26_S26_L001_R2_001.fastq.gz" "RH27_S27_L001_R1_001.fastq.gz" "RH27_S27_L001_R2_001.fastq.gz"
[55] "RH28_S28_L001_R1_001.fastq.gz" "RH28_S28_L001_R2_001.fastq.gz" "RH29_S29_L001_R1_001.fastq.gz"
[58] "RH29_S29_L001_R2_001.fastq.gz" "RH30_S30_L001_R1_001.fastq.gz" "RH30_S30_L001_R2_001.fastq.gz"
[61] "RH31_S31_L001_R1_001.fastq.gz" "RH31_S31_L001_R2_001.fastq.gz" "RH32_S32_L001_R1_001.fastq.gz"
[64] "RH32_S32_L001_R2_001.fastq.gz" "RH33_S33_L001_R1_001.fastq.gz" "RH33_S33_L001_R2_001.fastq.gz"
[67] "RH34_S34_L001_R1_001.fastq.gz" "RH34_S34_L001_R2_001.fastq.gz" "RH35_S35_L001_R1_001.fastq.gz"
[70] "RH35_S35_L001_R2_001.fastq.gz" "RH36_S36_L001_R1_001.fastq.gz" "RH36_S36_L001_R2_001.fastq.gz"
Quality of reads
In gray-scale is a heat map of the frequency of each quality score at each base position. The median quality score at each position is shown by the green line, and the quartiles of the quality score distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least that position (this is more useful for other sequencing technologies, as Illumina reads are typically all the same lenghth, hence the flat red line).
Source: DADA2 Pipeline Tutorial (1.12) NOTE: the reason the quality seems to be low at the beginning is that the program is using moving averages so there are less data points in the beginning. No trimming is needed on the left.
Forward reads
user system elapsed
290.668 22.189 313.520




































Reverse reads
user system elapsed
294.714 26.558 322.720




































Filter and trim sequences
The reads were trimmed approximately to the lenght at which the quality score median (the green line) went below 20.
The forward reads were of a very good quiality. Only last 20 bases were trimmed.
The reverse read were of lower quality and were trimmed at the length of 220 bases.
sample.names <- gsub(x = fnFs,
pattern = "fastq_jan2020/",
replacement = "")
sample.names <- sapply(strsplit(sample.names, "_"),
`[`,
1)
sample.names
[1] "RH01" "RH02" "RH03" "RH04" "RH05" "RH06" "RH07" "RH08" "RH09" "RH10" "RH11" "RH12" "RH13" "RH14" "RH15" "RH16"
[17] "RH17" "RH18" "RH19" "RH20" "RH21" "RH22" "RH23" "RH24" "RH25" "RH26" "RH27" "RH28" "RH29" "RH30" "RH31" "RH32"
[33] "RH33" "RH34" "RH35" "RH36"
filtFs <- gsub(x = fnFs,
pattern = "fastq_jan2020/",
replacement = "filtered_jan2020/")
filtRs <- gsub(x = fnRs,
pattern = "fastq_jan2020/",
replacement = "filtered_jan2020/")
out <- filterAndTrim(fwd = fnFs,
filt = filtFs,
rev = fnRs,
filt.rev = filtRs,
truncLen = c(280, 220),
# trimRight = c(20, 80),
maxN = 0,
maxEE = c(2, 2),
truncQ = 2,
rm.phix = TRUE,
compress = TRUE,
multithread = FALSE)
# NOTE: multi-tread messes up pairs of the files; using single tread instead.
save(out,
file = "data_jan2020/out.RData")
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6994016 373.6 12408701 662.7 11889647 635.0
Vcells 11990477 91.5 66625207 508.4 97695243 745.4
Reads after trimming: examples


Learn the error rates
NOTE: parameter learning is computationally intensive, so by default the learnErrors function uses only a subset of the data (the first 1M reads). If the plotted error model does not look like a good fit, try increasing the nreads parameter to see if the fit improves.
# fnFs <- sort(list.files(path,
# pattern="_R1_001.fastq",
# full.names = TRUE))
# filtFs <- gsub(x = fnFs,
# pattern = "fastq_sep2019/",
# replacement = "filtered_sep2019/")
system.time(errF <- learnErrors(filtFs,
multithread = FALSE))
154626080 total bases in 552236 reads from 2 samples will be used for learning the error rates.
user system elapsed
26845.522 442.297 27277.245
save(errF,
file = "data_jan2020/errF.RData")
# fnRs <- sort(list.files(path,
# pattern="_R2_001.fastq",
# full.names = TRUE))
# filtRs <- gsub(x = fnRs,
# pattern = "fastq_sep2019/",
# replacement = "filtered_sep2019/")
system.time(errR <- learnErrors(filtRs,
multithread = FALSE))
save(errR,
file = "data_sep2019/errR.RData")
Plot learn the error rates
Dereplicate the dataset
NOTE: for larger datasets (exceeding available RAM) process samples one-by-one. See DADA2 Workflow on Big Data.
# fnFs <- sort(list.files(path,
# pattern="_R1_001.fastq",
# full.names = TRUE))
# filtFs <- gsub(x = fnFs,
# pattern = "fastq_sep2019/",
# replacement = "filtered_sep2019/")
system.time(derepFs <- derepFastq(filtFs,
verbose = TRUE))
save(derepFs,
file = "data_sep2019/derepFs.RData")
head(derepFs)
gc()
# fnRs <- sort(list.files(path,
# pattern="_R2_001.fastq",
# full.names = TRUE))
# filtRs <- gsub(x = fnRs,
# pattern = "fastq_sep2019/",
# replacement = "filtered_sep2019/")
system.time(derepRs <- derepFastq(filtRs,
verbose = TRUE))
save(derepRs,
file = "data_sep2019/derepRs.RData")
head(derepRs)
gc()
Alignment
Notes from IGS Workshop*:
Sample Inference - inferring the sequence variants in each sample.
By default, the dada function processes each sample independently, but pooled processing is available with pool=TRUE and that may give better results for low sampling depths at the cost of increased computation time.
All samples are simultaneously loaded into memory by default. If the datasets approach or exceed available RAM, it is preferable to process samples one-by-one in a streaming fashion: see DADA2 Workflow on Big Data for an example.
# load("data_sep2019/errF.RData")
# load("data_sep2019/derepFs.RData")
system.time(dadaFs <- dada(derep = derepFs,
err = errF,
multithread = TRUE))
save(dadaFs,
file = "data_sep2019/dadaFs.RData")
# load("data_sep2019/errR.RData")
# load("data_sep2019/derepRs.RData")
system.time(dadaRs <- dada(derep = derepRs,
err = errR,
multithread = TRUE))
save(dadaRs,
file = "data_sep2019/dadaRs.RData")
Merge paired reads
Make a sequence table for chimera removal
NOTE: According to the IGS, denovo chimeras are determined based on most abundant sequencins in a given data. Usually 5-7% of sequences are chimeras. It is much higher in this dataset (60.4%). IGS recommends revisiting the removal of primers, as the ambiguous nucleotides in unremoved primers interfere with chimera identification.
Number of reads per sample throughout processing
# load("data_sep2019/out.RData")
# load("data_sep2019/dadaFs.RData")
# fnFs <- sort(list.files(path,
# pattern="_R1_001.fastq",
# full.names = TRUE))
# sample.names <- gsub(x = fnFs,
# pattern = "fastq_sep2019/",
# replacement = "")
# sample.names <- sapply(strsplit(sample.names, "_"),
# `[`,
# 1)
# sample.names
getN <- function(x) {
sum(getUniques(x))
}
track <- cbind(out,
sapply(dadaFs,
getN),
sapply(mergers,
getN),
rowSums(seqtab),
rowSums(seqtab.nochim))
colnames(track) <- c("Raw",
"Filtered",
"Denoised",
"Merged",
"Tabled",
"Non-Chimeras")
rownames(track) <- sample.names
datatable(format(track,
big.mark = ","),
options = list(pageLength = nrow(track)))
IGS suggests the number of merged sequences can potentially be increased by truncating the reads less (truncLen parameter in the filterAndTrim function), specifically, making sure that the truncated reads span the amplicon. This might not be the case here as the remaining reads are relatively long (280 bases for forward and 220 reads for reverse reads).
Save amplicon sequence variants (ASV) as a FastA file
Write out and save your results thus far:
fc <- file("data_sep2019/all_runs_dada2_ASV.fasta")
fltp <- character()
for( i in 1:ncol(seqtab)) {
fltp <- append(fltp,
paste0(">Seq_",
i))
fltp <- append(fltp,
colnames(seqtab)[i])
}
writeLines(fltp,
fc)
close(fc)
head(fltp)
rm(fltp)
gc()
Assign taxonomy
NOTE: create taxa.RData once, then comment it out and load the R data file to when reruning the code.
# taxa <- assignTaxonomy(seqs = seqtab.nochim,
# refFasta = "tax/silva_nr_v132_train_set.fa",
# multithread = mt)
# save(taxa,
# file = "data_sep2019/taxa.RData")
load("data_sep2019/taxa.RData")
print(paste("Number of unique references =",
format(nrow(taxa),
big.mark = ",")))
datatable(taxa[1:5, ],
rownames = FALSE)
# Keep only the references found in the data
taxa.tmp <- taxa[rownames(taxa) %in% colnames(seqtab.nochim), ]
print(paste("Number of references matched in the data =",
format(nrow(taxa.tmp),
big.mark = ",")))
# # Add species (do it once)
# taxa.plus <- addSpecies(taxtab = taxa.tmp,
# refFasta = "tax/silva_species_assignment_v132.fa",
# verbose = TRUE)
# save(taxa.plus,
# file = "data_may2019/taxa.plus.RData")
#
# load("data_sep2019/taxa.plus.RData")
load("data_sep2019/dt.meta.RData")
dt.otu <- otu_table(seqtab.nochim,
taxa_are_rows = FALSE)
sample_names(dt.otu) <- sample.names
print("Sample names in OTU table")
sample_names(dt.otu)
metadata <- sample_data(dt.meta)
rownames(metadata) <- metadata$SAMPLE_NAME
print("Sample names in metadata")
sample_names(metadata)
metadata@row.names <- sample_names(dt.otu)
ps_sep2019 <- phyloseq(dt.otu,
metadata,
tax_table(taxa))
sample_names(ps_sep2019)
save(ps_sep2019,
file = "data_sep2019/ps_sep2019.RData")
---
title: "Rasika's 16S samples, January 2020 Batch"
output: 
  html_notebook:
    toc: yes
    toc_float: yes
---
Date: `r date()`     
Scientist: [Ran Yin](mailto:ry147@scarletmail.rutgers.edu)      
Sequencing (Waksman): [Dibyendu Kumar](mailto:dk@waksman.rutgers.edu) (?)      
Statistics: [Davit Sargsyan](mailto:sargdavid@gmail.com)      
Principal Investigator: [Ah-Ng Kong](mailto:kongt@pharmacy.rutgers.edu)      

# Sources
## Script
This script was developed using [DADA2 Pipeline Tutorial (1.12)](https://benjjneb.github.io/dada2/tutorial.html) with tips and tricks from the [University of Maryland Shool of Medicine Institute for Genome Sciences (IGS)](http://www.igs.umaryland.edu/) [Microbiome Analysis Workshop (April 8-11, 2019)](http://www.igs.umaryland.edu/education/wkshp_metagenome.php).
  
## Data 
FastQ files were downloaded from [this Rutgers Box location](https://rutgers.app.box.com/folder/90143462291). A total of 144 files (2 per sample, pair-ended) and a pair of undetermined reads were downloaded.  
  
***16s metadata Sep-2019.xlsx*** meta-data file was created by Ran. It was saved as .CSV file and used by this script.  
  
# Load libraries
```{r setup, include = FALSE}
require(knitr)
require(kableExtra)

# # Increase memory size to 64 Gb
# invisible(utils::memory.limit(65536))
options(stringsAsFactors = FALSE)
# str(knitr::opts_chunk$get())
# # NOTE: the below does not work!
# knitr::opts_chunk$set(echo = FALSE, 
#                       message = FALSE,
#                       warning = FALSE,
#                       error = FALSE)

# On Windows set multithread=FALSE
# Otherwise, TRUE or number of cores
# mt <- 30
mt <- TRUE

# # Source: https://benjjneb.github.io/dada2/index.html
# # Installed on J&J Rstudio server  on 05/22/2019
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install(version = "3.8")
# BiocManager::install("dada2", version = "3.8")
# BiocManager::install("phyloseq", version = "3.8")

# Follow the tutorial:
# https://benjjneb.github.io/dada2/tutorial.html

require(data.table)
require(dada2)
require(phyloseq)
require(ggplot2)
library(stringr)
require(DT)

path <- "fastq_jan2020"
```

# Meta-data
```{r meta_data}
# dt.meta <- fread("data_jan2020/16s metadata Sep-2019.csv")
# save(dt.meta,
#      file = "data_sep2019/dt.meta.RData")
# datatable(dt.meta,
#           caption = "Table 1: Meta-Data",
#           rownames = FALSE,
#           class = "cell-border stripe",
#           options = list(searching = TRUE,
#                          pageLength = 8))
```
  
**NOTE**: moved "Undetermined" samples from the FastQ folder to the docs folder.  
  
# Questions
1. Did microbiome change over time?   
2. Was microbiome affected by diet?  
3. Was microbiome affected by KO compared to WT?  
  
# FastQ files
```{r fastq}
# Get FastQ file names----
list.files(path = path,
           pattern = ".gz")
```

# Quality of reads
In **gray-scale** is a heat map of the frequency of each quality score at each base position. The median quality score at each position is shown by the **green** line, and the quartiles of the quality score distribution by the **orange** lines. The **red** line shows the scaled proportion of reads that extend to at least that position (this is more useful for other sequencing technologies, as Illumina reads are typically all the same lenghth, hence the flat red line).      
Source: [DADA2 Pipeline Tutorial (1.12)](https://benjjneb.github.io/dada2/tutorial.html) 
**NOTE**: the reason the quality seems to be low at the beginning is that the program is using moving averages so there are less data points in the beginning. No trimming is needed on the left.

## Forward reads
```{r plot_quality_fwd, warnings = FALSE, echo = FALSE, message = FALSE, fig.height = 4, fig.width = 5}
fnFs <- sort(list.files(path, 
                        pattern="_R1_001.fastq", 
                        full.names = TRUE))
system.time({
  for (i in 1:length(fnFs)) {
    print(plotQualityProfile(fnFs[i]))
  }
})
```

## Reverse reads
```{r plot_quality_rev, warnings = FALSE, echo = FALSE, message = FALSE, fig.height = 4, fig.width = 5}
fnRs <- sort(list.files(path,
                        pattern="_R2_001.fastq",
                        full.names = TRUE))
system.time({
  for (i in 1:length(fnRs)) {
    print(plotQualityProfile(fnRs[i]))
  }
})
```

# Filter and trim sequences
The reads were trimmed approximately to the lenght at which the quality score median (the **green** line) went below 20.    
The forward reads were of a very good quiality. Only last 20 bases were trimmed.    
The reverse read were of lower quality and were trimmed at the length of 220 bases.  
  
```{r sort_n_trim_prep}
sample.names <- gsub(x = fnFs,
                     pattern = "fastq_jan2020/",
                     replacement = "")
sample.names <- sapply(strsplit(sample.names, "_"),
                       `[`, 
                       1)
sample.names

filtFs <- gsub(x = fnFs,
               pattern = "fastq_jan2020/",
               replacement = "filtered_jan2020/")

filtRs <- gsub(x = fnRs,
               pattern = "fastq_jan2020/",
               replacement = "filtered_jan2020/")
```

```{r sort_n_trim}
out <- filterAndTrim(fwd = fnFs, 
                     filt = filtFs,
                     rev = fnRs, 
                     filt.rev = filtRs, 
                     truncLen = c(280, 220),
                     # trimRight = c(20, 80),
                     maxN = 0, 
                     maxEE = c(2, 2), 
                     truncQ = 2, 
                     rm.phix = TRUE, 
                     compress = TRUE,
                     multithread = FALSE)
# NOTE: multi-tread messes up pairs of the files; using single tread instead.

save(out,
     file = "data_jan2020/out.RData")
gc()
```

# Reads after trimming: examples
```{r trim_rev,warnings=FALSE,echo=FALSE,message=FALSE}
plotQualityProfile(filtFs[1])
plotQualityProfile(filtRs[1])
```

# Learn the error rates
**NOTE**: parameter learning is computationally intensive, so by default the learnErrors function uses only a subset of the data (the first 1M reads). If the plotted error model does not look like a good fit, try increasing the nreads parameter to see if the fit improves.
```{r learn_error_fs}
# fnFs <- sort(list.files(path,
#                         pattern="_R1_001.fastq",
#                         full.names = TRUE))
# filtFs <- gsub(x = fnFs,
#                pattern = "fastq_sep2019/",
#                replacement = "filtered_sep2019/")

system.time(errF <- learnErrors(filtFs, 
                                multithread = FALSE))
save(errF,
     file = "data_jan2020/errF.RData")
```

```{r learn_error_rs}
# fnRs <- sort(list.files(path,
#                         pattern="_R2_001.fastq",
#                         full.names = TRUE))
# filtRs <- gsub(x = fnRs,
#                pattern = "fastq_sep2019/",
#                replacement = "filtered_sep2019/")

system.time(errR <- learnErrors(filtRs, 
                                multithread = FALSE))
save(errR,
     file = "data_sep2019/errR.RData")
```

# Plot learn the error rates
```{r plot_error,warnings=FALSE,echo=FALSE,message=FALSE}
plotErrors(errF, 
           nominalQ = TRUE)
plotErrors(errR, 
           nominalQ = TRUE)
```

# Dereplicate the dataset 
**NOTE**: for larger datasets (exceeding available RAM) process samples one-by-one. See [DADA2 Workflow on Big Data](https://benjjneb.github.io/dada2/bigdata.html).
```{r dereplicate_fs}
# fnFs <- sort(list.files(path, 
#                         pattern="_R1_001.fastq", 
#                         full.names = TRUE))
# filtFs <- gsub(x = fnFs,
#                pattern = "fastq_sep2019/",
#                replacement = "filtered_sep2019/")

system.time(derepFs <- derepFastq(filtFs, 
                                  verbose = TRUE))
save(derepFs,
     file = "data_sep2019/derepFs.RData")

head(derepFs)
gc()
```

```{r dereplicate_rs}
# fnRs <- sort(list.files(path,
#                         pattern="_R2_001.fastq",
#                         full.names = TRUE))
# filtRs <- gsub(x = fnRs,
#                pattern = "fastq_sep2019/",
#                replacement = "filtered_sep2019/")

system.time(derepRs <- derepFastq(filtRs, 
                                  verbose = TRUE))
save(derepRs,
     file = "data_sep2019/derepRs.RData")

head(derepRs)
gc()
```

# Alignment
**Notes from IGS Workshop***:    
Sample Inference - inferring the sequence variants in each sample.     
      
By default, the ***dada*** function processes each sample independently, but pooled processing is available with ***pool=TRUE*** and that may give better results for low sampling depths at the cost of increased computation time.     
     
All samples are simultaneously loaded into memory by default. If the datasets approach or exceed available RAM, it is preferable to process samples one-by-one in a streaming fashion: see [DADA2 Workflow on Big Data](https://benjjneb.github.io/dada2/bigdata.html) for an example.    
```{r dada_fs}
# load("data_sep2019/errF.RData")
# load("data_sep2019/derepFs.RData")

system.time(dadaFs <- dada(derep = derepFs, 
                           err = errF,
                           multithread = TRUE))
save(dadaFs,
     file = "data_sep2019/dadaFs.RData")
```

```{r dada_rs}
# load("data_sep2019/errR.RData")
# load("data_sep2019/derepRs.RData")

system.time(dadaRs <- dada(derep = derepRs, 
                           err = errR,
                           multithread = TRUE))
save(dadaRs,
     file = "data_sep2019/dadaRs.RData")
```

# Merge paired reads
```{r merge,warnings=FALSE,echo=FALSE,message=FALSE,eval=TRUE}
# load("data_sep2019/dadaFs.RData")
# load("data_sep2019/derepFs.RData")
# load("data_sep2019/dadaRs.RData")
# load("data_sep2019/derepRs.RData")

system.time(mergers <- mergePairs(dadaF = dadaFs,
                                  derepF = derepFs,
                                  dadaR = dadaRs, 
                                  derepR = derepRs,
                                  verbose = TRUE))
save(mergers,
     file = "data_sep2019/mergers.RData")
```

# Make a sequence table for chimera removal
```{r chimera,warnings=FALSE,echo=FALSE,message=FALSE,eval=TRUE}
# load("data_sep2019/mergers.RData")

system.time(seqtab <- makeSequenceTable(mergers))

dim(seqtab)
save(seqtab,
     file = "data_sep2019/seqtab.RData")
gc()

# Remove chimeras
system.time(seqtab.nochim <- removeBimeraDenovo(unqs = seqtab,
                                                method = "consensus",
                                                multithread = mt,
                                                verbose = TRUE))

dim(seqtab.nochim) 
save(seqtab.nochim,
     file = "data_may2019/seqtab.nochim.RData")
write.csv(seqtab.nochim, 
          file = "data_may2019/seqtab.nochim.csv", 
          quote = FALSE)

# 1 - proportion of chimeras
print(paste("Chimeras = ",
            round(100*(1 - (sum(seqtab.nochim)/sum(seqtab))), 1),
            "%",
            sep = ""))
```

**NOTE**: According to the IGS, denovo chimeras are determined based on most abundant sequencins in a given data. Usually 5-7% of sequences are chimeras. It is much higher in this dataset (60.4%). IGS recommends revisiting the removal of primers, as the ambiguous nucleotides in unremoved primers interfere with chimera identification. 

## Number of reads per sample throughout processing
```{r check_length}
# load("data_sep2019/out.RData")
# load("data_sep2019/dadaFs.RData")
# fnFs <- sort(list.files(path,
#                         pattern="_R1_001.fastq",
#                         full.names = TRUE))
# sample.names <- gsub(x = fnFs,
#                      pattern = "fastq_sep2019/",
#                      replacement = "")
# sample.names <- sapply(strsplit(sample.names, "_"),
#                        `[`, 
#                        1)
# sample.names

getN <- function(x) {
  sum(getUniques(x))
} 
track <- cbind(out, 
               sapply(dadaFs, 
                      getN),
               sapply(mergers,
                      getN),
               rowSums(seqtab), 
               rowSums(seqtab.nochim))
colnames(track) <- c("Raw", 
                     "Filtered",
                     "Denoised", 
                     "Merged",
                     "Tabled",
                     "Non-Chimeras")
rownames(track) <- sample.names
datatable(format(track,
                 big.mark = ","),
          options = list(pageLength = nrow(track)))
```

IGS suggests the number of **merged** sequences can potentially be increased by truncating the reads less (***truncLen*** parameter in the ***filterAndTrim*** function), specifically, making sure that the truncated reads span the amplicon. This might not be the case here as the remaining reads are relatively long (280 bases for forward and 220 reads for reverse reads).

# Save amplicon sequence variants (ASV) as a FastA file
Write out and save your results thus far: 
```{r save_fasta_asv}
fc <- file("data_sep2019/all_runs_dada2_ASV.fasta")
fltp <- character()
for( i in 1:ncol(seqtab)) {
  fltp <- append(fltp, 
                 paste0(">Seq_", 
                        i))
  fltp <- append(fltp,
                 colnames(seqtab)[i])
}
writeLines(fltp, 
           fc)
close(fc)
head(fltp)
rm(fltp)
gc()
```

# Assign taxonomy
**NOTE**: create ***taxa.RData*** once, then comment it out and load the R data file to when reruning the code. 
```{r tax_assign}
# taxa <- assignTaxonomy(seqs = seqtab.nochim,
#                        refFasta = "tax/silva_nr_v132_train_set.fa",
#                        multithread = mt)
# save(taxa,
#      file = "data_sep2019/taxa.RData")

load("data_sep2019/taxa.RData")
print(paste("Number of unique references =",
            format(nrow(taxa),
                   big.mark = ",")))

datatable(taxa[1:5, ],
          rownames = FALSE)

# Keep only the references found in the data
taxa.tmp <- taxa[rownames(taxa) %in% colnames(seqtab.nochim), ]
print(paste("Number of references matched in the data =",
            format(nrow(taxa.tmp),
                   big.mark = ",")))

# # Add species (do it once)
# taxa.plus <- addSpecies(taxtab = taxa.tmp,
#                         refFasta = "tax/silva_species_assignment_v132.fa",
#                         verbose = TRUE)
# save(taxa.plus,
#      file = "data_may2019/taxa.plus.RData")
# 
# load("data_sep2019/taxa.plus.RData")
```

```{r phyloseq}
load("data_sep2019/dt.meta.RData")
dt.otu <- otu_table(seqtab.nochim, 
                    taxa_are_rows = FALSE)
sample_names(dt.otu) <- sample.names
print("Sample names in OTU table")
sample_names(dt.otu)

metadata <- sample_data(dt.meta)
rownames(metadata) <- metadata$SAMPLE_NAME
print("Sample names in metadata")
sample_names(metadata)

metadata@row.names <- sample_names(dt.otu)

ps_sep2019 <- phyloseq(dt.otu, 
                       metadata,
                       tax_table(taxa))
sample_names(ps_sep2019)
save(ps_sep2019,
     file = "data_sep2019/ps_sep2019.RData")
```

# Session Information
```{r info,eval=TRUE}
sessionInfo()
```